Ask a home buyer to describe their dream house, and they probably won’t begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition’s dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.
Deliverables:
Your group is to turn in a paper that is no more than 7 pages long (without the appendix). Please put your code in the appendix, but put any graphs and tables in the body of the paper.
Sample Format
Required deliverables in the complete report. The format of your paper (headers, sections, etc.) is flexible, although it should contain the following information:
Introduction
Data Description
(Where did the data come from? How big is it? How many observations? Where can we find out more? What are the specific variables that we need to understand with respect to your analysis?)
Analysis Question 1:
Restatement of Problem
Specify the Model
Checking Assumptions
Residual Plots
Influential point analysis (Cook’s D and Leverage)
Make sure to address each assumption.
Comparing Competing Models
adj R2
Internal CV PRESS
Parameter Interpretation
Interpretation
Confidence Intervals
Conclusion
A short summary of the analysis.
Analysis Question 2
Restatement of Problem
Model Selection
Type of Selection
Stepwise
Forward
Backward
CUSTOM
Checking Assumptions
Residual Plots
Influential point analysis (Cook’s D and Leverage)
Make sure to address each assumption.
Comparing Competing Models
Adj R2
Internal CV PRESS
Kaggle Score
Conclusion: A short summary of the analysis.
Appendix
Well-commented SAS Code for Analysis 1 and 2
Rubric:
Presentation (30%):
Organized paper with title, headings, subheadings, etc.
Labeled plots, figures, tables and charts.
Every plot, figure, table and chart included is referenced in the paper and vice versa.
No spelling or grammatical errors.
Analysis Question 1: (35%)
Analysis Question 2: (35%)
# Read the training set from `data_dir`, then move the session to `home_dir`.
# Both directory variables are defined outside this section.
setwd(data_dir)
homes <- read.csv("train.csv", stringsAsFactors = FALSE)
setwd(home_dir)

# Work with lower-case column names throughout the script.
names(homes) <- tolower(names(homes))

# Turn every text column into a factor. Column 1 is left untouched, exactly as
# in the rest of this script (presumably the id column -- confirm).
homes[-1] <- lapply(homes[-1], function(column) {
  if (is.character(column)) factor(column) else column
})
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... remove outliers ... more than 5 sigma from mean value
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# Every numeric value lying more than `sigma_limit` standard deviations from
# its column mean is replaced by NA. Column 1 and the last column are skipped.
sigma_limit <- 5
lst <- length(homes) - 1 # sale price is (currently) last column
for (i in seq_len(lst)[-1]) {
  if (is.numeric(homes[[i]])) {
    # Standardize ONCE. Re-running scale() after the high outliers were already
    # set to NA would judge the low tail against a different mean and sd.
    z <- as.vector(scale(homes[[i]]))
    # which() drops NA / NaN scores (missing values, zero-variance columns).
    homes[[i]][which(abs(z) > sigma_limit)] <- NA
  }
}
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... create a few new columns
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# sale_date: the first day of the month of sale, built from yrsold / mosold
# as a "YYYY MM 01" string and parsed with the matching format.
dates <- with(homes, paste(yrsold, sprintf("%02d", mosold), "01", sep = " "))
homes[["sale_date"]] <- as.Date(dates, format = "%Y %m %d")

# total_baths: each half bath counts as half of a full bath.
homes[["total_baths"]] <- homes[["fullbath"]] + 0.5 * homes[["halfbath"]]
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... scale each column independently
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# for (i in 2 : length(homes))
# {
# if(class(homes[,i]) == "integer" || class(homes[,i]) == "numeric")
# {
# homes[,i] <- scale(homes[,i])
# }
# }
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... make some plots for numeric variables... linear, log_x, log_y, log_xy ...
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# pdf ("homes_train_plots.pdf", width = 10, height = 7)

# Numeric columns: three panels per variable on a 2 x 3 grid -- the raw values
# in row order, their histogram, and log(saleprice) against the variable.
par(mfrow = c(2, 3))
for (i in seq_along(homes)[-1]) {
  if (is.numeric(homes[[i]])) {
    col_name <- names(homes)[i]
    values <- homes[[i]]
    plot(values, main = col_name, ylab = col_name)
    hist(values, main = paste("Histogram of", col_name), xlab = col_name)
    plot(log(homes$saleprice) ~ values,
         xlab = col_name, ylab = "log(saleprice)")
  }
}
par(mfrow = c(1, 1))

# Factor columns: one boxplot of log(saleprice) per level.
# ggplot2 must already be attached; it is not loaded in this section.
for (i in seq_along(homes)[-1]) {
  if (is.factor(homes[[i]])) {
    col_name <- names(homes)[i]
    p <- ggplot(homes, aes(x = homes[, i], y = log(saleprice),
                           fill = homes[, i])) +
      geom_boxplot() +
      # The title has to be added to `p` itself; a bare `p + ggtitle(...)`
      # statement builds a titled copy and throws it away.
      ggtitle(col_name) +
      # Replace the default "homes[, i]" axis / legend labels.
      labs(x = col_name, fill = col_name)
    print(p)
  }
}

plot(homes$saleprice ~ homes$sale_date)
# dev.off()
# Univariate screen: regress saleprice on each numeric column in turn
# (column 1 is skipped) and print the column index, its name, the R^2 of the
# fit and the p-value of the slope.
for (col_idx in seq_along(homes)[-1]) {
  predictor <- homes[[col_idx]]
  if (!is.numeric(predictor)) {
    next
  }
  fit <- lm(homes$saleprice ~ predictor)
  fit_summary <- summary(fit)
  # Take the 4th column first and then its 2nd element: this yields NA rather
  # than an error when the coefficient table has no slope row (see the
  # poolarea line in the output below).
  slope_p_value <- fit_summary$coefficients[, 4][2]
  print(sprintf(" ... %3d : %20s | r^2 = %8.3f | p-value = %12.4e",
                col_idx, names(homes)[col_idx],
                fit_summary$r.squared, slope_p_value))
}
## [1] " ... 2 : mssubclass | r^2 = 0.007 | p-value = 1.2665e-03"
## [1] " ... 4 : lotfrontage | r^2 = 0.145 | p-value = 9.6021e-43"
## [1] " ... 5 : lotarea | r^2 = 0.140 | p-value = 1.1041e-49"
## [1] " ... 18 : overallqual | r^2 = 0.626 | p-value = 2.1857e-313"
## [1] " ... 19 : overallcond | r^2 = 0.006 | p-value = 2.9124e-03"
## [1] " ... 20 : yearbuilt | r^2 = 0.273 | p-value = 2.9902e-103"
## [1] " ... 21 : yearremodadd | r^2 = 0.257 | p-value = 3.1649e-96"
## [1] " ... 27 : masvnrarea | r^2 = 0.214 | p-value = 2.1244e-77"
## [1] " ... 35 : bsmtfinsf1 | r^2 = 0.166 | p-value = 2.4724e-59"
## [1] " ... 37 : bsmtfinsf2 | r^2 = 0.003 | p-value = 4.8072e-02"
## [1] " ... 38 : bsmtunfsf | r^2 = 0.046 | p-value = 1.1830e-16"
## [1] " ... 39 : totalbsmtsf | r^2 = 0.417 | p-value = 6.3610e-173"
## [1] " ... 44 : x1stflrsf | r^2 = 0.395 | p-value = 6.7032e-161"
## [1] " ... 45 : x2ndflrsf | r^2 = 0.102 | p-value = 5.7643e-36"
## [1] " ... 46 : lowqualfinsf | r^2 = 0.003 | p-value = 2.7800e-02"
## [1] " ... 47 : grlivarea | r^2 = 0.519 | p-value = 1.9399e-233"
## [1] " ... 48 : bsmtfullbath | r^2 = 0.052 | p-value = 1.5503e-18"
## [1] " ... 49 : bsmthalfbath | r^2 = 0.000 | p-value = 5.7466e-01"
## [1] " ... 50 : fullbath | r^2 = 0.314 | p-value = 1.2365e-121"
## [1] " ... 51 : halfbath | r^2 = 0.081 | p-value = 1.6505e-28"
## [1] " ... 52 : bedroomabvgr | r^2 = 0.029 | p-value = 7.2242e-11"
## [1] " ... 53 : kitchenabvgr | r^2 = 0.018 | p-value = 1.9184e-07"
## [1] " ... 55 : totrmsabvgrd | r^2 = 0.285 | p-value = 2.7723e-108"
## [1] " ... 57 : fireplaces | r^2 = 0.218 | p-value = 6.1415e-80"
## [1] " ... 60 : garageyrblt | r^2 = 0.237 | p-value = 8.7051e-83"
## [1] " ... 62 : garagecars | r^2 = 0.410 | p-value = 2.4986e-169"
## [1] " ... 63 : garagearea | r^2 = 0.389 | p-value = 5.2650e-158"
## [1] " ... 67 : wooddecksf | r^2 = 0.107 | p-value = 9.8439e-38"
## [1] " ... 68 : openporchsf | r^2 = 0.115 | p-value = 1.9621e-40"
## [1] " ... 69 : enclosedporch | r^2 = 0.020 | p-value = 4.9036e-08"
## [1] " ... 70 : x3ssnporch | r^2 = 0.000 | p-value = 9.2444e-01"
## [1] " ... 71 : screenporch | r^2 = 0.007 | p-value = 1.6782e-03"
## [1] " ... 72 : poolarea | r^2 = 0.000 | p-value = NA"
## [1] " ... 76 : miscval | r^2 = 0.000 | p-value = 4.2297e-01"
## [1] " ... 77 : mosold | r^2 = 0.002 | p-value = 7.6128e-02"
## [1] " ... 78 : yrsold | r^2 = 0.001 | p-value = 2.6941e-01"
## [1] " ... 81 : saleprice | r^2 = 1.000 | p-value = 0.0000e+00"
## [1] " ... 83 : total_baths | r^2 = 0.358 | p-value = 2.8268e-142"
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... Columns to remove - based on visual inspection
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
drop_cols <- c(
  "id", "mssubclass", "street", "alley", "utilities", "condition2",
  "roofmatl", "centralair", "bsmtfinsf2", "lowqualfinsf", "bsmthalfbath",
  "kitchenabvgr", "x3ssnporch", "screenporch", "garagequal", "garagecond",
  "paveddrive", "poolarea", "poolqc", "miscval", "mosold", "yrsold"
)

# Stop on a misspelled / missing column instead of silently ignoring it.
stopifnot(all(drop_cols %in% names(homes)))

# Keep every row and all remaining columns, in their original order.
homes_subset <- homes[, !(names(homes) %in% drop_cols), drop = FALSE]
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... Impute NAs to functional value
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# Numeric columns: NA -> the column minimum.
# Categorical columns (factor or character): NA -> the level / value "None".
#
# Every column is visited. The id column was already dropped above, so
# starting at column 2 would skip the first real feature.
# NOTE(review): values masked as outliers earlier are NA here too, so they
# also receive the column minimum -- confirm that this is intended.
for (col_name in names(homes_subset)) {
  column <- homes_subset[[col_name]]
  if (!anyNA(column)) {
    next
  }

  if (is.numeric(column)) {
    if (all(is.na(column))) {
      next # no observed value to impute from; min() would return Inf
    }
    column[is.na(column)] <- min(column, na.rm = TRUE)
  } else if (is.factor(column)) {
    # The text columns were converted to factors when the data was loaded, so
    # a test for "character" alone never matches them. "None" must be a level
    # before it can be assigned; union() leaves an existing "None" level alone.
    levels(column) <- union(levels(column), "None")
    column[is.na(column)] <- "None"
  } else if (is.character(column)) {
    column[is.na(column)] <- "None"
  } else {
    next # other types (e.g. Date) are left as they are
  }

  homes_subset[[col_name]] <- column
}
# Append the natural log of the sale price as the last column, then drop the
# raw price and the derived sale date from the data frame.
homes_subset[["log_saleprice"]] <- log(homes_subset[["saleprice"]])
export_drop <- c("saleprice", "sale_date")
stopifnot(all(export_drop %in% names(homes_subset)))
homes_subset <- homes_subset[, !(names(homes_subset) %in% export_drop),
                             drop = FALSE]

# Write the cleaned training set into the SAS shared folder. The working
# directory is changed to that folder and stays there afterwards.
sas_dir <- "~/sas/SASUniversityEdition/myfolders/"
setwd(sas_dir)
write.csv(homes_subset, file = "training_set_cleaned.csv", row.names = FALSE)